# Do we have missing data? Coerce the money/year columns to numeric and
# preview which rows survive na.omit() before committing to the drop.
bech %>%
  mutate(across(.cols = -c(imdb, code, binary), ~as.numeric(.x))) %>%
  na.omit(.)
## # A tibble: 1,484 x 10
## year imdb budget domgross intgross code budget_2013 domgross_2013
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1974 tt0071562 13000000 57300000 57300000 1974PASS 61408439 270669505
## 2 1982 tt0084516 10700000 74706019 121706019 1982PASS 25821968 180285645
## 3 2008 tt0800241 15000000 2203641 6379575 2008PASS 16233845 2384904
## 4 2011 tt1625346 12000000 16311571 22750356 2011PASS 12428289 16893744
## 5 2000 tt0190590 26000000 45506619 75763814 2000FAIL 35175618 61566286
## 6 2009 tt1232207 20000000 14363397 19121531 2009FAIL 21714632 15594794
## 7 2007 tt0449088 300000000 309420425 960996492 2007PASS 337063045 347647302
## 8 2009 tt0875034 80000000 19676965 53508858 2009FAIL 86858528 21363903
## 9 2005 tt0369994 2000000 2072645 2077844 2005PASS 2386066 2472734
## 10 2002 tt0283139 16000000 16357770 21657770 2002PASS 20722867 21186244
## # ... with 1,474 more rows, and 2 more variables: intgross_2013 <dbl>,
## # binary <chr>
# 1,484 rows have no NA or missing values. Drop the others.
bech <- bech %>%
  mutate(across(.cols = -c(imdb, code, binary), ~as.numeric(.x))) %>%
  na.omit(.)
# Class balance of the outcome we will model.
bech %>%
  count(binary)
## # A tibble: 2 x 2
## binary n
## <chr> <int>
## 1 FAIL 820
## 2 PASS 664
# 820 FAILS, 664 PASSES.
# Five-number summaries of the cleaned data: the money columns are heavily
# right-skewed (means well above medians), which motivates the log transform
# used in the plots below.
summary(bech)
## year imdb budget domgross
## Min. :1970 Length:1484 Min. : 7000 Min. : 828
## 1st Qu.:1998 Class :character 1st Qu.: 12000000 1st Qu.: 16244304
## Median :2005 Mode :character Median : 29000000 Median : 42704878
## Mean :2003 Mean : 45555379 Mean : 70440928
## 3rd Qu.:2009 3rd Qu.: 63000000 3rd Qu.: 95052518
## Max. :2013 Max. :425000000 Max. :760507625
## intgross code budget_2013 domgross_2013
## Min. :8.280e+02 Length:1484 Min. : 8632 Min. :8.990e+02
## 1st Qu.:2.583e+07 Class :character 1st Qu.: 16143738 1st Qu.:2.022e+07
## Median :7.875e+07 Mode :character Median : 36995786 Median :5.608e+07
## Mean :1.545e+08 Mean : 56311299 Mean :9.719e+07
## 3rd Qu.:1.986e+08 3rd Qu.: 81684071 3rd Qu.:1.255e+08
## Max. :2.784e+09 Max. :461435929 Max. :1.772e+09
## intgross_2013 binary
## Min. :8.990e+02 Length:1484
## 1st Qu.:3.278e+07 Class :character
## Median :9.933e+07 Mode :character
## Mean :2.037e+08
## 3rd Qu.:2.467e+08
## Max. :3.172e+09
# Simple bar plot for pass/fail proportions, rendered interactively via plotly.
ggplotly(
  bech %>%
    count(binary) %>%
    mutate(bech_percent = n / sum(n)) %>%
    ggplot() +
    geom_col(aes(x = binary, y = bech_percent, fill = binary)) +
    scale_y_continuous(limits = c(0, 1),
                       labels = label_percent()) +
    labs(y = "Bechdel Test Proportion (%)",
         x = "Bechdel Test Result",
         # 1,484 complete records remain after the na.omit() step above
         # (the original figure said 1500, overstating the count).
         subtitle = "Bechdel Test Results for 1,484 IMDB records (1970 - 2013)",
         title = "Lack of Real Portrayal of Women in Fiction") +
    scale_fill_brewer(palette = "Set1") +
    theme(legend.position = "none")
)
# Does Bechdel Test result change over time? Stacked (filled) densities show
# the pass/fail share by year rather than raw counts.
bech %>%
  ggplot(aes(year, fill = binary, group = binary)) +
  geom_density(position = "fill") +
  # "Portrayal" was misspelled "Portayal" in the original title.
  labs(title = "Portrayal of Women in Fiction",
       subtitle = "Some Improvement in 40 Years",
       x = "Year",
       y = "Proportion Pass/Fail",
       fill = "Bechdel Test") +
  scale_y_continuous(labels = percent_format())
# Distribution of the predictor variables. Pivot the money columns long and
# histogram each variable's own values. The original mapped x = year, which
# drew the identical year distribution in every facet instead of the
# predictors themselves; x = value (with free x scales, since the variables
# span very different dollar ranges) shows the skew the title refers to.
bech %>%
  select(-code) %>%
  pivot_longer(cols = -c(year, imdb, binary), names_to = "Variables") %>%
  ggplot() +
  geom_histogram(aes(x = value, group = Variables, fill = Variables), bins = 43) +
  facet_wrap(~ Variables, scales = "free_x") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  labs(title = "Skewed Distributions")
# Predictors: compare the logged money variables between films that pass and
# fail the test. Failing films look to sit higher on most dollar measures.
# NOTE(review): dollar_format() here labels the *logged* values, consistent
# with the "US Dollars (logged)" axis title.
bech %>%
  select(-code) %>%
  pivot_longer(cols = -c(year, imdb, binary), names_to = "Variables") %>%
  mutate(value = log(value)) %>%
  ggplot(aes(x = binary, y = value, group = binary, fill = binary)) +
  geom_boxplot() +
  facet_wrap(~ Variables, scales = "free") +
  scale_y_continuous(labels = dollar_format()) +
  theme(legend.position = "none") +
  labs(y = "US Dollars (logged)",
       x = "Bechdel Test Result",
       title = "More Money, Less Representation?",
       subtitle = "Possible Correlation with Bechdel Test Failure and Money")
#require(caret)
# Build the modelling frame: drop the identifier columns and recode the
# outcome to 0/1 (PASS = 1) for logistic regression.
moddat <- bech %>%
  select(-imdb, -code) %>%
  mutate(binary = ifelse(binary == "PASS", 1, 0))
# Fit a logistic regression on all remaining predictors.
logmodel <- glm(binary ~ ., family = binomial, data = moddat)
# Check out the model summary
summary(logmodel)
##
## Call:
## glm(formula = binary ~ ., family = binomial, data = moddat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4367 -1.1293 -0.8252 1.1676 2.3270
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.002e+01 2.072e+01 0.484 0.628701
## year -4.920e-03 1.034e-02 -0.476 0.634124
## budget 1.048e-08 8.600e-09 1.219 0.222930
## domgross 2.368e-08 7.173e-09 3.301 0.000964 ***
## intgross -9.630e-09 3.515e-09 -2.740 0.006145 **
## budget_2013 -1.594e-08 7.183e-09 -2.219 0.026468 *
## domgross_2013 -1.895e-08 5.531e-09 -3.427 0.000611 ***
## intgross_2013 8.216e-09 2.774e-09 2.961 0.003063 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2040.8 on 1483 degrees of freedom
## Residual deviance: 1977.3 on 1476 degrees of freedom
## AIC: 1993.3
##
## Number of Fisher Scoring iterations: 5
# Tidy coefficient table for easier reading.
broom::tidy(logmodel)
## # A tibble: 8 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.00e+1 2.07e+1 0.484 0.629
## 2 year -4.92e-3 1.03e-2 -0.476 0.634
## 3 budget 1.05e-8 8.60e-9 1.22 0.223
## 4 domgross 2.37e-8 7.17e-9 3.30 0.000964
## 5 intgross -9.63e-9 3.51e-9 -2.74 0.00615
## 6 budget_2013 -1.59e-8 7.18e-9 -2.22 0.0265
## 7 domgross_2013 -1.90e-8 5.53e-9 -3.43 0.000611
## 8 intgross_2013 8.22e-9 2.77e-9 2.96 0.00306
# Threshold fitted probabilities at 0.5 into PASS/FAIL labels.
moddat$predicted_glm <- ifelse(logmodel$fitted.values >= 0.5, "PASS", "FAIL")
# How accurate is our model? Re-label the 0/1 actuals so the confusion
# table's diagonal is guaranteed to line up with the matching predictions
# (with raw 0/1 rows the alignment held only by ordering coincidence).
results_tab <- table(ifelse(moddat$binary == 1, "PASS", "FAIL"),
                     moddat$predicted_glm)
acc <- sum(diag(results_tab)) / sum(results_tab) * 100
print(paste0("Accuracy: ", round(acc, 2), "%"))
## [1] "Accuracy: 58.36%"
# AUC; a distinct name avoids shadowing pROC::roc with the result object.
roc_full <- roc(moddat$binary, logmodel$fitted.values)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste0("AUC: ", round(auc(roc_full), 4)))
## [1] "AUC: 0.6162"
# Train a reduced model on just the inflation-adjusted (2013) dollar columns.
logmodel_2013 <- glm(binary ~ budget_2013 + domgross_2013 + intgross_2013,
                     family = binomial, data = moddat)
# Check out the model summary
summary(logmodel_2013)
##
## Call:
## glm(formula = binary ~ budget_2013 + domgross_2013 + intgross_2013,
## family = binomial, data = moddat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3346 -1.1135 -0.8468 1.1904 1.8752
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.899e-01 8.118e-02 2.340 0.019296 *
## budget_2013 -6.356e-09 1.324e-09 -4.799 1.59e-06 ***
## domgross_2013 -4.267e-09 1.248e-09 -3.419 0.000628 ***
## intgross_2013 1.769e-09 5.870e-10 3.014 0.002576 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2040.8 on 1483 degrees of freedom
## Residual deviance: 1997.5 on 1480 degrees of freedom
## AIC: 2005.5
##
## Number of Fisher Scoring iterations: 4
broom::tidy(logmodel_2013)
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.90e-1 8.12e- 2 2.34 0.0193
## 2 budget_2013 -6.36e-9 1.32e- 9 -4.80 0.00000159
## 3 domgross_2013 -4.27e-9 1.25e- 9 -3.42 0.000628
## 4 intgross_2013 1.77e-9 5.87e-10 3.01 0.00258
# Threshold fitted probabilities at 0.5 into PASS/FAIL labels.
moddat$predicted_glm_2013 <- ifelse(logmodel_2013$fitted.values >= 0.5, "PASS", "FAIL")
# How accurate is our model? Re-label actuals so the diagonal alignment of
# the confusion table is explicit rather than coincidental.
results_tab <- table(ifelse(moddat$binary == 1, "PASS", "FAIL"),
                     moddat$predicted_glm_2013)
acc <- sum(diag(results_tab)) / sum(results_tab) * 100
print(paste0("Accuracy: ", round(acc, 2), "%"))
## [1] "Accuracy: 56.27%"
# AUC; named to avoid shadowing pROC::roc.
roc_2013 <- roc(moddat$binary, logmodel_2013$fitted.values)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste0("AUC: ", round(auc(roc_2013), 4)))
## [1] "AUC: 0.5955"
# Backward stepwise selection by AIC, starting from the full logistic model.
logmodel_aic <- MASS::stepAIC(logmodel)
## Start: AIC=1993.29
## binary ~ year + budget + domgross + intgross + budget_2013 +
## domgross_2013 + intgross_2013
##
## Df Deviance AIC
## - year 1 1977.5 1991.5
## - budget 1 1978.8 1992.8
## <none> 1977.3 1993.3
## - budget_2013 1 1982.6 1996.6
## - intgross 1 1985.8 1999.8
## - intgross_2013 1 1987.4 2001.4
## - domgross 1 1991.0 2005.0
## - domgross_2013 1 1993.4 2007.4
##
## Step: AIC=1991.52
## binary ~ budget + domgross + intgross + budget_2013 + domgross_2013 +
## intgross_2013
##
## Df Deviance AIC
## - budget 1 1978.9 1990.9
## <none> 1977.5 1991.5
## - budget_2013 1 1983.6 1995.6
## - intgross 1 1986.1 1998.1
## - intgross_2013 1 1987.8 1999.8
## - domgross 1 1992.5 2004.5
## - domgross_2013 1 1995.7 2007.7
##
## Step: AIC=1990.9
## binary ~ domgross + intgross + budget_2013 + domgross_2013 +
## intgross_2013
##
## Df Deviance AIC
## <none> 1978.9 1990.9
## - intgross 1 1986.1 1996.1
## - intgross_2013 1 1987.8 1997.8
## - domgross 1 1992.9 2002.9
## - domgross_2013 1 1996.0 2006.0
## - budget_2013 1 2007.1 2017.1
# What if we used a stepAIC'd model?
moddat$predicted_glm_aic <- ifelse(logmodel_aic$fitted.values >= 0.5, "PASS", "FAIL")
# How accurate is our model? Re-label actuals so the diagonal alignment of
# the confusion table is explicit rather than coincidental.
results_tab <- table(ifelse(moddat$binary == 1, "PASS", "FAIL"),
                     moddat$predicted_glm_aic)
acc <- sum(diag(results_tab)) / sum(results_tab) * 100
print(paste0("Accuracy: ", round(acc, 2), "%"))
## [1] "Accuracy: 58.89%"
# AUC; named to avoid shadowing pROC::roc.
roc_aic <- roc(moddat$binary, logmodel_aic$fitted.values)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste0("AUC: ", round(auc(roc_aic), 4)))
## [1] "AUC: 0.6173"
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
# Mixed model with a random intercept per year.
# NOTE(review): lmer() on a 0/1 outcome fits a linear probability model;
# glmer(..., family = binomial) would be the mixed-effects analogue of the
# logistic models above — confirm the LPM is intended.
mmodel <- lmer(binary ~ budget_2013 + domgross_2013 + intgross_2013 + (1 | year), data = moddat)
summary(mmodel)
## Linear mixed model fit by REML ['lmerMod']
## Formula: binary ~ budget_2013 + domgross_2013 + intgross_2013 + (1 | year)
## Data: moddat
##
## REML criterion at convergence: 2230
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.1841 -0.9370 -0.6088 1.0360 1.7887
##
## Random effects:
## Groups Name Variance Std.Dev.
## year (Intercept) 0.00202 0.04494
## Residual 0.23897 0.48885
## Number of obs: 1484, groups: year, 44
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 5.396e-01 2.154e-02 25.056
## budget_2013 -1.504e-09 2.968e-10 -5.069
## domgross_2013 -9.541e-10 2.771e-10 -3.443
## intgross_2013 4.015e-10 1.345e-10 2.985
##
## Correlation of Fixed Effects:
## (Intr) b_2013 d_2013
## budget_2013 -0.499
## dmgrss_2013 -0.348 0.288
## intgrs_2013 0.280 -0.495 -0.915
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Population-level predictions (re.form = NA ignores the random effects),
# thresholded at 0.5 into PASS/FAIL labels.
moddat$predicted_mmodel <- predict(mmodel, re.form = NA)
moddat$predicted_mmodel <- ifelse(moddat$predicted_mmodel >= 0.5, "PASS", "FAIL")
# How accurate is our model? Re-label actuals so the diagonal alignment of
# the confusion table is explicit rather than coincidental.
results_tab <- table(ifelse(moddat$binary == 1, "PASS", "FAIL"),
                     moddat$predicted_mmodel)
acc <- sum(diag(results_tab)) / sum(results_tab) * 100
print(paste0("Accuracy: ", round(acc, 2), "%"))
## [1] "Accuracy: 56%"
# Looks like the step AIC model is the best… Let’s use it.
# Load the hold-out set and coerce the numeric-like columns.
test <- read.csv('Test.csv') %>%
  as_tibble(.)
# NOTE(review): only domgross NAs are dropped here; rows with NAs in the other
# model inputs (intgross, *_2013 columns) would produce NA predictions —
# confirm the test file has none.
test <- test %>%
  mutate(across(-c(imdb, code), ~as.numeric(.x))) %>%
  filter(!is.na(domgross))
# Score with the stepAIC-selected logistic model and threshold at 0.5.
test$binary_predicted <- predict(logmodel_aic, test, type = "response")
test <- test %>%
  mutate(binary_predicted = ifelse(binary_predicted > 0.5, "PASS", "FAIL"))
test %>%
  count(binary_predicted)
## # A tibble: 2 x 2
## binary_predicted n
## <chr> <int>
## 1 FAIL 189
## 2 PASS 103
# Bar plot of the predicted pass/fail proportions on the hold-out set.
test %>%
  count(binary_predicted) %>%
  mutate(bech_percent = n / sum(n)) %>%
  ggplot() +
  geom_col(aes(x = binary_predicted, y = bech_percent, fill = binary_predicted)) +
  scale_y_continuous(limits = c(0, 1),
                     labels = label_percent()) +
  labs(y = "Predicted Bechdel Test Proportion (%)",
       x = "Bechdel Test Result",
       # 292 scored rows per the count above (original said 1500).
       # Year range assumed to match the training window — TODO confirm.
       subtitle = "Predicted Bechdel Test Results for 292 IMDB records (1970 - 2013)",
       title = "Lack of Real Portrayal of Women in Fiction") +
  scale_fill_brewer(palette = "Set1") +
  theme(legend.position = "none")
# Does Bechdel Test result change over time?
test %>%
  ggplot(aes(year, fill = binary_predicted, group = binary_predicted)) +
  geom_density(position = "fill") +
  # "Portrayal" was misspelled "Portayal" in the original title.
  labs(title = "Predicted Portrayal of Women in Fiction",
       subtitle = "Some Improvement in 40 Years?",
       x = "Year",
       y = "Predicted Proportion Pass/Fail",
       fill = "Bechdel Test") +
  scale_y_continuous(labels = percent_format())
# Persist the scored test set (FALSE, not the reassignable shorthand F).
write.csv(test, 'test_dataset_with_predictions.csv', row.names = FALSE)